# CFI Model ICD10 
# Versioned in Python based upon original SAS code provided by Dae Hyun Kim
# Authored by: Doug Bedell at Westat and Vince Tambellini at VillageMD
# Updated 7/10/2021 to use new ICD-9 and ICD-10 lookup tables

# This script reads the CFI input and lookup files from WorkDir
# and writes a CFI frailty score for each patient.

# Load Pandas with alias 'pd' 
import pandas as pd
# Load Numpy with alias 'np'
import numpy as np
# Load bisect for lookup
import bisect

# Regression model intercept, derived from the trained model. It is added to
# every patient's summed disease weights, so it is also the default frailty
# score for patients with no diagnoses or procedures.
ModelIntercept = 0.10288
# Input/output directory for all files. File names are appended directly to
# this string, so it must end with a path separator.
# NOTE(review): the leading '...' looks like a placeholder -- set it to the
# real location before running.
WorkDir = '...\\Frailty_ICD10\\'

# Function to perform PX range lookups using binary search.
def lookup_pxdisease(px):
    """Return the disease number whose PX code range contains *px*, or 0.

    Uses the module-level sequences ``start_list``, ``stop_list`` and
    ``px_values``. They must be parallel (entry ``i`` of each describes the
    same range) and ordered by ascending ``stop`` so that ``stop_list`` can
    be binary-searched. Only the first range whose ``stop`` is >= *px* is
    checked, so the ranges are assumed not to overlap.

    Args:
        px: HCPCS/CPT procedure code, normally a 5-character string. Missing
            values (None/NaN) yield 0. Numeric values, which pandas produces
            when an all-digit column is not read as text, are converted back
            to a zero-padded 5-character string before the lookup.

    Returns:
        The matching entry of ``px_values``, or 0 when the code is missing,
        is not 5 characters long, ends in a letter, or falls in no range.
    """
    if not isinstance(px, str):
        # len() and indexing below only work on strings; handle the
        # non-string cells pandas can hand over instead of raising.
        if pd.isna(px):
            return 0
        if isinstance(px, float):
            if not px.is_integer():
                return 0
            px = int(px)
        # A numeric parse drops leading zeros (e.g. '00100' -> 100).
        px = str(px).zfill(5)
    # Only 5-character codes that do not end in a letter are scored. This
    # skips CPT type II codes ending in F, and any other letter-suffixed code.
    if len(px) != 5 or px[4].isalpha():
        return 0
    # First range whose stop code is >= px; px matches only if that range's
    # start code is also <= px.
    idx = bisect.bisect_left(stop_list, px)
    if idx < len(stop_list) and start_list[idx] <= px <= stop_list[idx]:
        return px_values[idx]
    return 0

# NOTE: Every diagnosis/procedure code column is read as text (dtype=str).
# Left to infer types, pandas parses all-digit codes as numbers, which drops
# leading zeros (e.g. '00100' becomes 100) and stops the codes from matching
# the string keys and string ranges in the lookup tables.

# Read in all study IDs (patid)
iddata = pd.read_csv(WorkDir + 'ids.txt', sep='\t')

# Read in ICD-9 DX data (patid, dx)
dx9data = pd.read_csv(WorkDir + 'dx09.txt', sep='\t', dtype={'dx': str})

# Read in ICD-9 DX lookup data. Lines starting with '#' are comments.
dx9lookup = pd.read_csv(WorkDir + 'CFI_ICD9CM_V32.csv', sep=',', comment='#',
                        dtype={'dx': str})

# Read in ICD-10 DX data (patid, dx)
dx10data = pd.read_csv(WorkDir + 'dx10.txt', sep='\t', dtype={'dx': str})

# Read in ICD-10 DX lookup data. Lines starting with '#' are comments.
dx10lookup = pd.read_csv(WorkDir + 'CFI_ICD10CM_V2020.csv', sep=',', comment='#',
                         dtype={'dx': str})

# Read in PX data (patid, px)
pxdata = pd.read_csv(WorkDir + 'px.txt', sep='\t', dtype={'px': str})

# Read in PX lookup data. Implements PX section of SAS codes_org.
# 'start' and 'stop' are compared against px codes as strings.
pxlookup = pd.read_csv(WorkDir + 'pxlookup.txt', sep='\t', comment='#',
                       dtype={'start': str, 'stop': str})

# Read in model disease weights lookup
weightlookup = pd.read_csv(WorkDir + 'disease_weight.txt', sep='\t')

# NOTE: Each DX/PX only needs to be scored once per patient, so exact
# duplicate rows are dropped before any lookup is performed.

# ICD-9 diagnoses: left-join the lookup on 'dx' so every diagnosis row is
# kept. Rows without a match come back as NaN and are replaced with 0.
dx9data = pd.merge(dx9data.drop_duplicates(), dx9lookup, how='left', on='dx')
dx9data = dx9data.fillna(0)
# The NaN introduced by unmatched rows upcasts disease_number from int64 to
# float64; cast it back.
dx9data = dx9data.astype({'disease_number': 'int64'})

# ICD-10 diagnoses: same steps against the ICD-10 lookup table.
dx10data = pd.merge(dx10data.drop_duplicates(), dx10lookup, how='left', on='dx')
dx10data = dx10data.fillna(0)
# Undo the int64 -> float64 upcast caused by NaN, as above.
dx10data = dx10data.astype({'disease_number': 'int64'})

# Ensure PX lookup data is sorted on 'stop' first: lookup_pxdisease
# binary-searches stop_list, which only works on ascending values.
pxlookup = pxlookup.sort_values(by=['stop']).reset_index(drop=True)
# Materialise the columns as plain lists. bisect then indexes by position
# directly instead of doing a label-based pandas Series lookup on every
# probe of every procedure row.
start_list = list(pxlookup['start'])
stop_list = list(pxlookup['stop'])
px_values = list(pxlookup['disease_number'])

# Procedures: drop exact duplicate rows, then run every HCPCS/CPT code in 'px'
# through lookup_pxdisease to fill a new disease_number column.
pxdata = pxdata.drop_duplicates()
pxdata = pxdata.assign(disease_number=pxdata['px'].apply(lookup_pxdisease))

# Study IDs: give every ID a dummy disease_number of 0. Any PatID absent from
# the DX9, DX10 and PX files then still appears in the combined data and ends
# up with the default weight (ModelIntercept).
iddata = iddata.drop_duplicates().assign(disease_number=0)

# Combine the data, keeping only patient ID and disease number.
# The two columns are named explicitly instead of being derived from whatever
# columns the inputs happen to share: any extra shared column would be carried
# along and stop drop_duplicates() from collapsing repeated diseases for a
# patient, and both names are required by the sort and merge below anyway.
df_list = [dx9data, dx10data, pxdata, iddata]
common_cols = ['patid', 'disease_number']
diseasedata = pd.concat([df[common_cols] for df in df_list], ignore_index=True)

# Remove duplicates. Each disease should only be weighted once per patient,
# even when several DX/PX codes map to it.
diseasedata = diseasedata.drop_duplicates()

# Sort by patient, disease
diseasedatasort = diseasedata.sort_values(by=['patid', 'disease_number'])

# Assign weights
# Left-merge the disease weights onto the disease data. Disease numbers with
# no entry in the weight table (NaN after the merge) get 0.
diseasedatasort = diseasedatasort.merge(
    weightlookup, on='disease_number', how='left'
).fillna(0)

# Calculate frailty scores by summing the weights of records grouped by patient ID.
scores = diseasedatasort.groupby(['patid'])['weight'].sum().reset_index()

# ModelIntercept is added to every score, so it is also the default score for
# those with no DX/PX. It is added to the weight column only: adding it to the
# whole frame also shifts patid into floating point, and recovering the ID by
# truncation is wrong for negative IDs, lossy for IDs above 2**53 and fails
# outright for non-numeric IDs.
# NOTE: Floating point precision using sum(). round() used as a quick way
# around it. Several other ways to handle it.
scores['weight'] = (scores['weight'] + ModelIntercept).round(6)

# Prevent float/scientific notation of IDs if patid arrived here as a float
# column; integer and text IDs are left untouched.
if pd.api.types.is_float_dtype(scores['patid']):
    scores['patid'] = scores['patid'].astype('int64')

# Rename columns so they're identical to the SAS output for quality check, and for standardization
scores.columns = ['patid', 'frailty_index']

# Write to CSV
scores.to_csv(WorkDir + 'frailty_output_python.csv', index=False)
